/******************************************************************************
This file cleans the baseline test score data.
--------------------------------------------------------------------------------
Input:			Raw 3rd - 8th grade standardized test data sets, one per school year
					> "${cleandata}outcomes/TestBiog/{yyyy}-{yy}_Student_test-Biog_All_G38_NYC_Scrambled.dta"

Output:			Appended long file (student x exam year x subject)
					> "${cleandata}baseline_scores_standardized_all_appended.dta"
				Cleaned, standardized student-level files, one per baseline grade (3 and 5)
					> "${cleandata}baseline_scores_standardized_grade_ms_{grade}.dta"
--------------------------------------------------------------------------------*/

	// Start from an empty session. The append flag switches the rebuild of the
	// stacked file on (1) or off (0); when off, the saved copy is reused below.
	clear all
	local append 1

// Stack all yearly files into one long student x exam year x subject file
if `append' == 1 {

	// Seed the accumulator with a truly empty dataset. Seeding it with a dummy
	// one-observation dataset would leave a blank row in the stacked output.
	// The tempfile gets its own name so it does not overwrite the append flag.
	tempfile stacked
	save `stacked', emptyok

	// Loop over school years (yr is the fall year of each school year)
	forvalues yr = 2005/2018 {

		// File names carry the school year as yyyy-yy, e.g. 2005-06
		local digits_plus = string(`yr' - 2000 + 1, "%02.0f")
		local next_year = `yr' + 1

		// Load raw file
		use "${cleandata}outcomes/TestBiog/`yr'-`digits_plus'_Student_test-Biog_All_G38_NYC_Scrambled", clear

		// Convert every grade, performance level and score variable to numeric.
		// With force, entries that are not numeric become missing instead of
		// stopping the script. This single call covers the raw score, scale
		// score and test grade variables used further down.
		destring *grade *level *score, replace force

		// Known duplicate record in the 2017-18 file: drop the copy with
		// math performance level 2
		if `yr' == 2017 {
			drop if student_id_scram == "345022136" & math_perf_level == 2
		}

		// Generate spring year indicator (the exam is taken in the spring)
		gen bl_exam_year = `next_year'

		// Add everything stacked so far and store the running total
		append using `stacked'
		save `stacked', replace

	}

	// Keep the student id, exam year and the per-subject score variables.
	// Full variable names are used so this does not depend on abbreviation.
	keep student_id_scram bl_exam_year *_raw_score *_test_grade *_scale_score

	// Reshape long by test x student x year. The subject prefix of each
	// variable name becomes the string variable subject.
	reshape long @_raw_score @_test_grade @_scale_score, i(student_id_scram bl_exam_year) j(subject) string

	// The reshaped variables keep a leading underscore; strip it
	ren _* *

	* Harmonize variable names
	ren student_id_scram stu

	sa "${cleandata}baseline_scores_standardized_all_appended.dta", replace

}

// Build one student-level baseline file per tested grade from the stacked file
use "${cleandata}baseline_scores_standardized_all_appended.dta", clear

********************************************************************
******** Keeping tests for each baseline grade (3 and 5)
********************************************************************

foreach grade in 3 5 {
	preserve
		// Keep if grade matches and score is not missing.
		// The missing-raw-score filter applies to exam years through 2017 only;
		// presumably raw scores are not reported in later files (TODO confirm).
		keep if test_grade == `grade'
		drop if missing(raw_score) & bl_exam_year <= 2017

		* Use the most recent attempt at this grade, separately by subject
		bys stu subject: egen max_year = max( bl_exam_year )
		drop if bl_exam_year != max_year
		drop max_year

		* Standardize the scale score within subject x exam year x grade
		bys subject bl_exam_year test_grade: egen mean = mean(scale_score)
		bys subject bl_exam_year test_grade: egen sd = sd(scale_score)
		gen bl_ss  = (scale_score - mean ) / sd
		drop mean sd

		// Prefix the subject so the wide variables read as name_subject
		replace subject = "_" + subject

		* Reshape wide: one row per student, one set of columns per subject
		reshape wide bl_exam_year@ test_grade@ raw_score@ bl_ss@ scale_score@, i(stu) j(subject) string

		* get a total score (missing unless both subjects are present)
		g scale_score_total = scale_score_math + scale_score_ela
		g bl_exam_year_total = bl_exam_year_math if bl_exam_year_math == bl_exam_year_ela
		g test_grade_total = test_grade_math if test_grade_math == test_grade_ela

		* standardize the total score
		// Only students whose math and ELA scores share exam year and grade are
		// standardized. Without this restriction, students with mismatched years
		// (missing bl_exam_year_total) would form a by-group of their own and be
		// standardized against each other across pooled years.
		g byte same_sitting = !missing(bl_exam_year_total, test_grade_total)
		bys bl_exam_year_total test_grade_total: egen mean = mean(scale_score_total) if same_sitting
		bys bl_exam_year_total test_grade_total: egen sd = sd(scale_score_total) if same_sitting
		g bl_total  = (scale_score_total - mean ) / sd
		drop mean sd same_sitting

		sa "${cleandata}baseline_scores_standardized_grade_ms_`grade'.dta", replace

	restore
}
